#Set my seed

set.seed(011099)
#check directory before knit
getwd()
## [1] "/local/workdir/mld253/git_repos/comp_micro_proj"

#Testing timing of Rmd document knitting

start_time <- Sys.time()
start_time
## [1] "2024-03-12 15:33:14 EDT"

#Load libraries

library(pacman)
pacman::p_load(tidyverse, BiocManager, devtools, dada2, 
               phyloseq, patchwork, DT, iNEXT, vegan, ggplot2, cowplot,
               install = FALSE)
library(patchwork)

Goals of this file:

  1. Load in raw sequencing files
  2. Assess quality of raw sequencing files
  3. Filter and trim sequences
  4. Write out filtered and trimmed, higher quality sequences
  5. Evaluate new quality of trimmed sequencing files
  6. Infer Errors on forward and reverse reads
  7. Identify ASVs on forward and reverse reads
  8. Merge forward and reverse ASVs
  9. Generate an ASV count table
  10. Assign taxonomy to ASVs

Output:

  1. ASV count table
  2. Taxonomy table
  3. Sample information that summarizes lost reads

Load data

getwd()
## [1] "/local/workdir/mld253/git_repos/comp_micro_proj"
raw_fastqs_path <- "ncbi_data/fastq_files"
#raw_fastqs_path
#head(list.files(raw_fastqs_path))
#str(list.files(raw_fastqs_path))

forward_reads <- list.files(raw_fastqs_path, pattern="_1.fastq.gz", full.names = TRUE)
#head(forward_reads)
#str(forward_reads)

reverse_reads <- list.files(raw_fastqs_path, pattern="_2.fastq.gz", full.names = TRUE)
#head(reverse_reads)

Evaluate raw sequence quality

Plot 12 random samples of plots

random_samples <- sample(1:length(forward_reads), size=12)
random_samples
##  [1] 72  1 40 57 50 35 65 56 52 44 46 66
forward_raw_plot_12 <- plotQualityProfile(forward_reads[random_samples]) +
  labs(title="Forward Reads Raw Quality")

reverse_raw_plot_12 <- plotQualityProfile(reverse_reads[random_samples]) +
  labs(title="Reverse Reads Raw Quality")


#ggplot2/patchwork  was having difficulty combining plots, I am not entirely certain why. chatGPT suggested installing cowplot. and it worked better
library(cowplot)


# Combine plots using cowplot
combined_plots <- plot_grid(forward_raw_plot_12, reverse_raw_plot_12, ncol = 2)

# Display the combined plots
print(combined_plots)

Aggregated raw quality plots

forward_raw_aggregated_plot <- plotQualityProfile(forward_reads, aggregate = TRUE) + labs(title="Aggregated Raw Fwd Reads")

reverse_raw_aggregated_plot <- plotQualityProfile(reverse_reads, aggregate=TRUE) + labs(title= "Aggregated Raw Rev Reads")

plot_grid(forward_raw_aggregated_plot, reverse_raw_aggregated_plot, ncol=2)

preqc_aggregated_plot <- plot_grid(forward_raw_aggregated_plot, reverse_raw_aggregated_plot, ncol=2)

There is a steep drop off in quality near the end of each reads, as expected. The reverse reads are also generally lower in quality, but this was also expected.

There is also a small number of bases at the very beginning of each read that has a consistently low quality. This needs to be trimmed out.

Some of the ends of the raw reads also have uncomfortably low quality.. around even a quality score of 10. We will see what happens further downstream with the analysis!

Prepare a placeholder for filtered reads

samples <- sapply(strsplit(basename(forward_reads), "_"), `[`,1)
head(samples)
## [1] "SRR19625066" "SRR19625067" "SRR19625068" "SRR19625069" "SRR19625070"
## [6] "SRR19625071"
filtered_fastqs_path <- "ncbi_data/filtered_fastqs"
filtered_fastqs_path
## [1] "ncbi_data/filtered_fastqs"
filtered_forward_reads <-file.path(filtered_fastqs_path, paste0(samples,"_R1_filtered.fastq.gz"))
length(filtered_forward_reads)
## [1] 72
filtered_reverse_reads <-file.path(filtered_fastqs_path, paste0(samples, "_R2_filtered.fastq.gz"))
length(filtered_reverse_reads)
## [1] 72
head(filtered_reverse_reads)
## [1] "ncbi_data/filtered_fastqs/SRR19625066_R2_filtered.fastq.gz"
## [2] "ncbi_data/filtered_fastqs/SRR19625067_R2_filtered.fastq.gz"
## [3] "ncbi_data/filtered_fastqs/SRR19625068_R2_filtered.fastq.gz"
## [4] "ncbi_data/filtered_fastqs/SRR19625069_R2_filtered.fastq.gz"
## [5] "ncbi_data/filtered_fastqs/SRR19625070_R2_filtered.fastq.gz"
## [6] "ncbi_data/filtered_fastqs/SRR19625071_R2_filtered.fastq.gz"

Filter and Trim Reads

First trimming runthrough, with “tight” trimming parameters for this data

filtered_reads <- filterAndTrim(fwd=forward_reads,
              filt=filtered_forward_reads, 
              rev = reverse_reads,
              filt.rev=filtered_reverse_reads, 
              maxN=0, 
              maxEE=c(2,2), 
              trimLeft=c(19,20), 
              truncQ=2,
              rm.phix = TRUE, 
              compress=TRUE,
              multithread=TRUE)

Viewing trimmed read quality!

forward_filteredQual_plot_12 <- 
 plotQualityProfile(filtered_forward_reads[random_samples]) + 
  labs(title = "Trimmed Forward Read Quality")

reverse_filteredQual_plot_12 <- 
  plotQualityProfile(filtered_reverse_reads[random_samples]) + 
  labs(title = "Trimmed Reverse Read Quality")


# Put the two plots together , using cowplot
plot_grid(forward_filteredQual_plot_12, reverse_filteredQual_plot_12, ncol=2)

#Aggregate Trimmed Plots
forward_QC_plot <- plotQualityProfile(filtered_forward_reads, aggregate=TRUE) + labs(title="Filtered Aggregated Fwd Reads")

reverse_QC_plot <- plotQualityProfile(filtered_reverse_reads,aggregate=TRUE) + labs(title="Filtered Aggregated Rev Reads")

#Plot together with cowplot again
plot_grid(forward_QC_plot, reverse_QC_plot, ncol=2)

Steep quality dropoff at the beginning of the reads is gone - successfully trimmed start of the reads, and primer sequences are removed

Read output stats

filtered_df <- as.data.frame(filtered_reads)
head(filtered_df)
##                        reads.in reads.out
## SRR19625066_1.fastq.gz    77543     33811
## SRR19625067_1.fastq.gz    73819     30748
## SRR19625068_1.fastq.gz   248353     99890
## SRR19625069_1.fastq.gz   169479     67389
## SRR19625070_1.fastq.gz   191391     86342
## SRR19625071_1.fastq.gz   131759     58224
filtered_df %>% 
  reframe(median_reads_in = median(reads.in),
          median_reads_out = median(reads.out),
          median_percent_retained=(median(reads.out)/median(reads.in)))
##   median_reads_in median_reads_out median_percent_retained
## 1          119394          49787.5               0.4170017

That filtered out a LARGE NUMBER of reads!! Only 41% retained! Maybe further quality filtering is not necessary!!

I will try again with slightly lower error filtering maxEE=2,3 so that we can compare both outputs.

Trimming with lower error filtering

#Filtered reads path placeholder
filtered_fastqs_2_path <- "ncbi_data/filtered_fastqs_lower_threshold"

#Filtered reads vectors
filtered_forward_reads_2 <-file.path(filtered_fastqs_2_path, paste0(samples,"_R1_filtered.fastq.gz"))
length(filtered_forward_reads)
## [1] 72
filtered_reverse_reads_2 <-file.path(filtered_fastqs_2_path, paste0(samples, "_R2_filtered.fastq.gz"))

#Trimming with lower paramaters
filtered_reads_2 <- filterAndTrim(fwd=forward_reads,
              filt=filtered_forward_reads_2, 
              rev = reverse_reads,
              filt.rev=filtered_reverse_reads_2, 
              maxN=0, 
              maxEE=c(2,3), 
              # Reverse reads were lower in quality - perhaps increasing the expected errors will increase retained reads
              #This is the only parameter that has changed
              trimLeft=c(19,20), 
              truncLen = c(240,220),
              truncQ=2,
              rm.phix = TRUE, 
              compress=TRUE,
              multithread=TRUE)

Assessing Second Trimming Read Quality

forward_filtered_qual_2 <- plotQualityProfile(filtered_forward_reads_2[random_samples]) + labs(title="Trimmed Forward Read Quality, Higher Error Rate")

reverse_filtered_qual_2 <- plotQualityProfile(filtered_reverse_reads_2[random_samples]) + labs(title="Trimmed Reverse Read Quality, Higher Error")



#Plot together with cowplot 
plot_grid(forward_filtered_qual_2, reverse_filtered_qual_2, ncol=2)

#Aggregated Trimmed Plots 2
forward_QC_plot_2 <-plotQualityProfile(filtered_forward_reads_2, aggregate=TRUE) + labs(title="Filtered Aggregated Forward Reads")

reverse_QC_plot_2 <- plotQualityProfile(filtered_reverse_reads_2, aggregate=TRUE) + labs(title="Filtered Aggregated Reverse Reads")

#View the plots
postqc_aggregated_plot <- plot_grid(forward_QC_plot_2, reverse_QC_plot_2, ncol=2)
postqc_aggregated_plot

Second trimming round output stats

filtered_df <- as.data.frame(filtered_reads_2)
head(filtered_df)
##                        reads.in reads.out
## SRR19625066_1.fastq.gz    77543     71046
## SRR19625067_1.fastq.gz    73819     66972
## SRR19625068_1.fastq.gz   248353    225428
## SRR19625069_1.fastq.gz   169479    152947
## SRR19625070_1.fastq.gz   191391    175990
## SRR19625071_1.fastq.gz   131759    120852
filtered_df %>% 
  reframe(median_reads_in = median(reads.in),
          median_reads_out = median(reads.out),
          median_percent_retained=(median(reads.out)/median(reads.in)))
##   median_reads_in median_reads_out median_percent_retained
## 1          119394           108216               0.9063772

Lower error parameters: Higher percent retained (59% vs 41% retained) and quality graph looks generally the same.

Will move forward with 59% retained reads, because there is some degree of error correction during dada2 pipeline. May revisit later and look at what happens with NO trimming step at all.

QC visualizations

plot_grid(forward_raw_aggregated_plot, 
          reverse_raw_aggregated_plot, 
          forward_QC_plot_2, 
          reverse_QC_plot_2, 
          align="v",
          nrow=2)

Error modeling

error_forward_reads <- 
  learnErrors(filtered_forward_reads_2, multithread=TRUE)
## 114122853 total bases in 516393 reads from 4 samples will be used for learning the error rates.
error_reverse_reads <- 
  learnErrors(filtered_reverse_reads_2, multithread=TRUE)
## 103278600 total bases in 516393 reads from 4 samples will be used for learning the error rates.
forward_error_plot <- plotErrors(error_forward_reads, nominalQ=TRUE) +
  labs(title="Forward Reads Error Model")

reverse_error_plot <- plotErrors(error_reverse_reads, nominalQ=TRUE) +
  labs(title="Reverse Reads Error Model")

plot_grid(forward_error_plot,reverse_error_plot, ncol=2)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.

I find it difficult to interpret this, but generally error rates drop with increased Phred score.

Infer ASVs

#forward and reverse sequences
dada_forward <-dada(filtered_forward_reads_2,
                    err=error_forward_reads, 
                    multithread=TRUE)
## Sample 1 - 71046 reads in 35220 unique sequences.
## Sample 2 - 66972 reads in 29411 unique sequences.
## Sample 3 - 225428 reads in 80179 unique sequences.
## Sample 4 - 152947 reads in 56234 unique sequences.
## Sample 5 - 175990 reads in 64914 unique sequences.
## Sample 6 - 120852 reads in 48013 unique sequences.
## Sample 7 - 158938 reads in 72618 unique sequences.
## Sample 8 - 103350 reads in 49815 unique sequences.
## Sample 9 - 85364 reads in 40509 unique sequences.
## Sample 10 - 150286 reads in 60942 unique sequences.
## Sample 11 - 162646 reads in 60577 unique sequences.
## Sample 12 - 104183 reads in 41466 unique sequences.
## Sample 13 - 144019 reads in 56623 unique sequences.
## Sample 14 - 64744 reads in 27776 unique sequences.
## Sample 15 - 55649 reads in 25435 unique sequences.
## Sample 16 - 136356 reads in 56766 unique sequences.
## Sample 17 - 166271 reads in 62123 unique sequences.
## Sample 18 - 71408 reads in 30684 unique sequences.
## Sample 19 - 197771 reads in 64990 unique sequences.
## Sample 20 - 78170 reads in 34665 unique sequences.
## Sample 21 - 142121 reads in 51328 unique sequences.
## Sample 22 - 123771 reads in 50591 unique sequences.
## Sample 23 - 96872 reads in 48131 unique sequences.
## Sample 24 - 116031 reads in 43951 unique sequences.
## Sample 25 - 116883 reads in 51168 unique sequences.
## Sample 26 - 91426 reads in 41789 unique sequences.
## Sample 27 - 95720 reads in 39510 unique sequences.
## Sample 28 - 105899 reads in 44980 unique sequences.
## Sample 29 - 108872 reads in 49441 unique sequences.
## Sample 30 - 118042 reads in 50403 unique sequences.
## Sample 31 - 29016 reads in 14993 unique sequences.
## Sample 32 - 76786 reads in 37822 unique sequences.
## Sample 33 - 66922 reads in 35195 unique sequences.
## Sample 34 - 90321 reads in 41234 unique sequences.
## Sample 35 - 69390 reads in 33345 unique sequences.
## Sample 36 - 36454 reads in 21184 unique sequences.
## Sample 37 - 115333 reads in 50430 unique sequences.
## Sample 38 - 51892 reads in 23402 unique sequences.
## Sample 39 - 166303 reads in 66146 unique sequences.
## Sample 40 - 104965 reads in 41895 unique sequences.
## Sample 41 - 93473 reads in 38456 unique sequences.
## Sample 42 - 98228 reads in 43896 unique sequences.
## Sample 43 - 116561 reads in 51015 unique sequences.
## Sample 44 - 73117 reads in 32359 unique sequences.
## Sample 45 - 181339 reads in 70169 unique sequences.
## Sample 46 - 73644 reads in 35764 unique sequences.
## Sample 47 - 36722 reads in 19412 unique sequences.
## Sample 48 - 151263 reads in 60473 unique sequences.
## Sample 49 - 163537 reads in 65844 unique sequences.
## Sample 50 - 113857 reads in 50862 unique sequences.
## Sample 51 - 126809 reads in 51919 unique sequences.
## Sample 52 - 55466 reads in 24003 unique sequences.
## Sample 53 - 112939 reads in 47733 unique sequences.
## Sample 54 - 85701 reads in 40991 unique sequences.
## Sample 55 - 147059 reads in 62016 unique sequences.
## Sample 56 - 159243 reads in 60742 unique sequences.
## Sample 57 - 88440 reads in 39130 unique sequences.
## Sample 58 - 142925 reads in 62193 unique sequences.
## Sample 59 - 179396 reads in 78724 unique sequences.
## Sample 60 - 99779 reads in 49487 unique sequences.
## Sample 61 - 179152 reads in 75416 unique sequences.
## Sample 62 - 125691 reads in 57791 unique sequences.
## Sample 63 - 31462 reads in 18684 unique sequences.
## Sample 64 - 99430 reads in 46897 unique sequences.
## Sample 65 - 40464 reads in 21590 unique sequences.
## Sample 66 - 367545 reads in 127389 unique sequences.
## Sample 67 - 76115 reads in 41534 unique sequences.
## Sample 68 - 147529 reads in 61949 unique sequences.
## Sample 69 - 88641 reads in 41829 unique sequences.
## Sample 70 - 161364 reads in 65564 unique sequences.
## Sample 71 - 107560 reads in 43150 unique sequences.
## Sample 72 - 197848 reads in 79191 unique sequences.
dada_reverse <- dada(filtered_reverse_reads_2,
                     err=error_reverse_reads,
                     multithread = TRUE)
## Sample 1 - 71046 reads in 42186 unique sequences.
## Sample 2 - 66972 reads in 37031 unique sequences.
## Sample 3 - 225428 reads in 111123 unique sequences.
## Sample 4 - 152947 reads in 76943 unique sequences.
## Sample 5 - 175990 reads in 85296 unique sequences.
## Sample 6 - 120852 reads in 61992 unique sequences.
## Sample 7 - 158938 reads in 89834 unique sequences.
## Sample 8 - 103350 reads in 60011 unique sequences.
## Sample 9 - 85364 reads in 49195 unique sequences.
## Sample 10 - 150286 reads in 80077 unique sequences.
## Sample 11 - 162646 reads in 85514 unique sequences.
## Sample 12 - 104183 reads in 53392 unique sequences.
## Sample 13 - 144019 reads in 73364 unique sequences.
## Sample 14 - 64744 reads in 35680 unique sequences.
## Sample 15 - 55649 reads in 31190 unique sequences.
## Sample 16 - 136356 reads in 72218 unique sequences.
## Sample 17 - 166271 reads in 84133 unique sequences.
## Sample 18 - 71408 reads in 39651 unique sequences.
## Sample 19 - 197771 reads in 89402 unique sequences.
## Sample 20 - 78170 reads in 43396 unique sequences.
## Sample 21 - 142121 reads in 70892 unique sequences.
## Sample 22 - 123771 reads in 66248 unique sequences.
## Sample 23 - 96872 reads in 58535 unique sequences.
## Sample 24 - 116031 reads in 59396 unique sequences.
## Sample 25 - 116883 reads in 63664 unique sequences.
## Sample 26 - 91426 reads in 52026 unique sequences.
## Sample 27 - 95720 reads in 51940 unique sequences.
## Sample 28 - 105899 reads in 57862 unique sequences.
## Sample 29 - 108872 reads in 61317 unique sequences.
## Sample 30 - 118042 reads in 64384 unique sequences.
## Sample 31 - 29016 reads in 18014 unique sequences.
## Sample 32 - 76786 reads in 46115 unique sequences.
## Sample 33 - 66922 reads in 41333 unique sequences.
## Sample 34 - 90321 reads in 51674 unique sequences.
## Sample 35 - 69390 reads in 38955 unique sequences.
## Sample 36 - 36454 reads in 24285 unique sequences.
## Sample 37 - 115333 reads in 63538 unique sequences.
## Sample 38 - 51892 reads in 29131 unique sequences.
## Sample 39 - 166303 reads in 84461 unique sequences.
## Sample 40 - 104965 reads in 55214 unique sequences.
## Sample 41 - 93473 reads in 50062 unique sequences.
## Sample 42 - 98228 reads in 54465 unique sequences.
## Sample 43 - 116561 reads in 64425 unique sequences.
## Sample 44 - 73117 reads in 42204 unique sequences.
## Sample 45 - 181339 reads in 91570 unique sequences.
## Sample 46 - 73644 reads in 42955 unique sequences.
## Sample 47 - 36722 reads in 23512 unique sequences.
## Sample 48 - 151263 reads in 77391 unique sequences.
## Sample 49 - 163537 reads in 84892 unique sequences.
## Sample 50 - 113857 reads in 63773 unique sequences.
## Sample 51 - 126809 reads in 65435 unique sequences.
## Sample 52 - 55466 reads in 29827 unique sequences.
## Sample 53 - 112939 reads in 58442 unique sequences.
## Sample 54 - 85701 reads in 49066 unique sequences.
## Sample 55 - 147059 reads in 80401 unique sequences.
## Sample 56 - 159243 reads in 79253 unique sequences.
## Sample 57 - 88440 reads in 48602 unique sequences.
## Sample 58 - 142925 reads in 77054 unique sequences.
## Sample 59 - 179396 reads in 102495 unique sequences.
## Sample 60 - 99779 reads in 59204 unique sequences.
## Sample 61 - 179152 reads in 95946 unique sequences.
## Sample 62 - 125691 reads in 69446 unique sequences.
## Sample 63 - 31462 reads in 21299 unique sequences.
## Sample 64 - 99430 reads in 58826 unique sequences.
## Sample 65 - 40464 reads in 25288 unique sequences.
## Sample 66 - 367545 reads in 174846 unique sequences.
## Sample 67 - 76115 reads in 48368 unique sequences.
## Sample 68 - 147529 reads in 76770 unique sequences.
## Sample 69 - 88641 reads in 53330 unique sequences.
## Sample 70 - 161364 reads in 84568 unique sequences.
## Sample 71 - 107560 reads in 57671 unique sequences.
## Sample 72 - 197848 reads in 104514 unique sequences.

Merge Forward and Reverse ASVs

merged_asvs <- mergePairs(dada_forward, 
                          filtered_forward_reads_2, 
                          dada_reverse, 
                          filtered_reverse_reads_2, 
                          verbose=TRUE)
## 41654 paired-reads (in 1064 unique pairings) successfully merged out of 61969 (in 3320 pairings) input.
## 42911 paired-reads (in 1130 unique pairings) successfully merged out of 59874 (in 3169 pairings) input.
## 160215 paired-reads (in 4410 unique pairings) successfully merged out of 208220 (in 11870 pairings) input.
## 104690 paired-reads (in 2902 unique pairings) successfully merged out of 140473 (in 8082 pairings) input.
## 124216 paired-reads (in 2799 unique pairings) successfully merged out of 162124 (in 7913 pairings) input.
## 83127 paired-reads (in 2183 unique pairings) successfully merged out of 110011 (in 6324 pairings) input.
## 93671 paired-reads (in 2484 unique pairings) successfully merged out of 142303 (in 8798 pairings) input.
## 61322 paired-reads (in 1683 unique pairings) successfully merged out of 90966 (in 5621 pairings) input.
## 51272 paired-reads (in 1261 unique pairings) successfully merged out of 75737 (in 4605 pairings) input.
## 97420 paired-reads (in 2508 unique pairings) successfully merged out of 136343 (in 7558 pairings) input.
## 116327 paired-reads (in 3267 unique pairings) successfully merged out of 148822 (in 8131 pairings) input.
## 70300 paired-reads (in 1842 unique pairings) successfully merged out of 94525 (in 5290 pairings) input.
## 96080 paired-reads (in 2724 unique pairings) successfully merged out of 130650 (in 7751 pairings) input.
## 42238 paired-reads (in 1190 unique pairings) successfully merged out of 58089 (in 3160 pairings) input.
## 35789 paired-reads (in 1003 unique pairings) successfully merged out of 49335 (in 2829 pairings) input.
## 87449 paired-reads (in 2268 unique pairings) successfully merged out of 122741 (in 6705 pairings) input.
## 114512 paired-reads (in 3171 unique pairings) successfully merged out of 152575 (in 8701 pairings) input.
## 44949 paired-reads (in 1327 unique pairings) successfully merged out of 63777 (in 3536 pairings) input.
## 146446 paired-reads (in 3742 unique pairings) successfully merged out of 183991 (in 9397 pairings) input.
## 48818 paired-reads (in 1321 unique pairings) successfully merged out of 69150 (in 3762 pairings) input.
## 100162 paired-reads (in 2753 unique pairings) successfully merged out of 130463 (in 7250 pairings) input.
## 84398 paired-reads (in 2002 unique pairings) successfully merged out of 112702 (in 6223 pairings) input.
## 54705 paired-reads (in 1448 unique pairings) successfully merged out of 85189 (in 5116 pairings) input.
## 80168 paired-reads (in 2325 unique pairings) successfully merged out of 106309 (in 6020 pairings) input.
## 76434 paired-reads (in 1804 unique pairings) successfully merged out of 105279 (in 6275 pairings) input.
## 55690 paired-reads (in 1471 unique pairings) successfully merged out of 81014 (in 4570 pairings) input.
## 63180 paired-reads (in 1607 unique pairings) successfully merged out of 86947 (in 4296 pairings) input.
## 68970 paired-reads (in 1874 unique pairings) successfully merged out of 95318 (in 5296 pairings) input.
## 68855 paired-reads (in 1675 unique pairings) successfully merged out of 97539 (in 5450 pairings) input.
## 74198 paired-reads (in 2026 unique pairings) successfully merged out of 106394 (in 5981 pairings) input.
## 16780 paired-reads (in 499 unique pairings) successfully merged out of 25071 (in 1273 pairings) input.
## 46005 paired-reads (in 1092 unique pairings) successfully merged out of 67740 (in 3833 pairings) input.
## 35970 paired-reads (in 941 unique pairings) successfully merged out of 58134 (in 3290 pairings) input.
## 55555 paired-reads (in 1487 unique pairings) successfully merged out of 80523 (in 4338 pairings) input.
## 41521 paired-reads (in 883 unique pairings) successfully merged out of 61880 (in 3214 pairings) input.
## 18714 paired-reads (in 458 unique pairings) successfully merged out of 30403 (in 1584 pairings) input.
## 71226 paired-reads (in 1984 unique pairings) successfully merged out of 103509 (in 5978 pairings) input.
## 33777 paired-reads (in 887 unique pairings) successfully merged out of 46103 (in 2282 pairings) input.
## 112255 paired-reads (in 2733 unique pairings) successfully merged out of 152287 (in 8610 pairings) input.
## 69079 paired-reads (in 1954 unique pairings) successfully merged out of 94807 (in 5306 pairings) input.
## 61996 paired-reads (in 1729 unique pairings) successfully merged out of 83864 (in 4682 pairings) input.
## 62702 paired-reads (in 1545 unique pairings) successfully merged out of 87697 (in 4822 pairings) input.
## 73617 paired-reads (in 1979 unique pairings) successfully merged out of 104525 (in 5989 pairings) input.
## 46148 paired-reads (in 1279 unique pairings) successfully merged out of 64714 (in 3424 pairings) input.
## 123470 paired-reads (in 3013 unique pairings) successfully merged out of 166472 (in 9212 pairings) input.
## 44589 paired-reads (in 1159 unique pairings) successfully merged out of 64689 (in 3514 pairings) input.
## 21573 paired-reads (in 584 unique pairings) successfully merged out of 31588 (in 1669 pairings) input.
## 101688 paired-reads (in 2476 unique pairings) successfully merged out of 137564 (in 7443 pairings) input.
## 109388 paired-reads (in 2306 unique pairings) successfully merged out of 149734 (in 7508 pairings) input.
## 70934 paired-reads (in 1880 unique pairings) successfully merged out of 102161 (in 5659 pairings) input.
## 83246 paired-reads (in 2331 unique pairings) successfully merged out of 114961 (in 6645 pairings) input.
## 36675 paired-reads (in 990 unique pairings) successfully merged out of 49577 (in 2471 pairings) input.
## 74082 paired-reads (in 1868 unique pairings) successfully merged out of 101709 (in 5685 pairings) input.
## 51780 paired-reads (in 1289 unique pairings) successfully merged out of 75903 (in 4135 pairings) input.
## 94461 paired-reads (in 2433 unique pairings) successfully merged out of 132799 (in 7464 pairings) input.
## 108641 paired-reads (in 2473 unique pairings) successfully merged out of 145636 (in 7331 pairings) input.
## 55043 paired-reads (in 1396 unique pairings) successfully merged out of 78952 (in 4233 pairings) input.
## 91558 paired-reads (in 2112 unique pairings) successfully merged out of 129192 (in 7106 pairings) input.
## 113519 paired-reads (in 2941 unique pairings) successfully merged out of 161091 (in 9950 pairings) input.
## 56394 paired-reads (in 1478 unique pairings) successfully merged out of 87235 (in 5167 pairings) input.
## 114904 paired-reads (in 3105 unique pairings) successfully merged out of 161651 (in 9499 pairings) input.
## 76950 paired-reads (in 1689 unique pairings) successfully merged out of 111701 (in 6103 pairings) input.
## 15411 paired-reads (in 409 unique pairings) successfully merged out of 25816 (in 1339 pairings) input.
## 59333 paired-reads (in 1552 unique pairings) successfully merged out of 88291 (in 4842 pairings) input.
## 22405 paired-reads (in 620 unique pairings) successfully merged out of 33987 (in 1741 pairings) input.
## 260390 paired-reads (in 6228 unique pairings) successfully merged out of 341354 (in 18461 pairings) input.
## 41002 paired-reads (in 992 unique pairings) successfully merged out of 65539 (in 3548 pairings) input.
## 95910 paired-reads (in 2202 unique pairings) successfully merged out of 133931 (in 7309 pairings) input.
## 53436 paired-reads (in 1362 unique pairings) successfully merged out of 78012 (in 4149 pairings) input.
## 105352 paired-reads (in 2682 unique pairings) successfully merged out of 147213 (in 8727 pairings) input.
## 72001 paired-reads (in 1724 unique pairings) successfully merged out of 97976 (in 4864 pairings) input.
## 137546 paired-reads (in 2876 unique pairings) successfully merged out of 181542 (in 9791 pairings) input.

Create Raw ASV count table

raw_asv_table <- makeSequenceTable(merged_asvs)
dim(raw_asv_table)
## [1]    72 47259
class(raw_asv_table)
## [1] "matrix" "array"

Trim ASVs

#Length of sequences?
table(nchar(getSequences(raw_asv_table)))
## 
##   262   263   264   265   266   267   268   269   270   271   272   273   274 
##    23   108    75   289    54   223   121   131     1     1    35    50    34 
##   275   276   277   278   279   280   281   282   284   285   286   287   288 
##    56   162 41981  3166   122   113    15    24     2     3     8     4     1 
##   289   290   294   297   298   299   300   301   302   304   305   306   307 
##     7     1     1     7     2    64    28     1     4     6     1     1     2 
##   308   309   311   312   316   319   320   326   372   381   382   384   386 
##     5    44   196     1     1     1     1     2     1    20     6     4    14 
##   387   408   409 
##    34     1     1
data.frame(Seq_Length = nchar(getSequences(raw_asv_table))) %>%
  ggplot(aes(x = Seq_Length )) + 
  geom_histogram(binwidth=5, 
                   breaks = seq(280, 350, by = 2)) + 
  labs(title = "Raw distribution of ASV length")

raw_asv_table_trimmed <- raw_asv_table[,nchar(colnames(raw_asv_table)) %in% 283:300]
  
sum(raw_asv_table_trimmed)/sum(raw_asv_table)
## [1] 0.00305425
data.frame(Seq_Length = nchar(getSequences(raw_asv_table_trimmed))) %>%
  ggplot(aes(x = Seq_Length )) + 
  geom_histogram() + 
  labs(title = "Trimmed distribution of ASV length")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Most still at 386, 99% reads retained after trim

Remove Chimeras

nochimeras_asv_table <- removeBimeraDenovo(raw_asv_table_trimmed, method="consensus", multithread=TRUE,verbose=TRUE)
## Identified 10 bimeras out of 128 input sequences.
dim(nochimeras_asv_table)
## [1]  72 118
#6732 ASVs retained
sum(nochimeras_asv_table)/sum(raw_asv_table)
## [1] 0.002910345
sum(nochimeras_asv_table)/sum(raw_asv_table_trimmed)
## [1] 0.9528837
#92% of asvs retained, 8% removed from start

Track Read Counts

getN <- function(x) sum(getUniques(x))

track <- cbind(filtered_reads_2, sapply(dada_forward, getN), sapply(dada_reverse, getN), sapply(merged_asvs, getN), rowSums(nochimeras_asv_table))

head(track)
##                        reads.in reads.out                          
## SRR19625066_1.fastq.gz    77543     71046  64789  66410  41654   73
## SRR19625067_1.fastq.gz    73819     66972  62259  63160  42911  161
## SRR19625068_1.fastq.gz   248353    225428 214306 217535 160215  235
## SRR19625069_1.fastq.gz   169479    152947 145208 146663 104690 1006
## SRR19625070_1.fastq.gz   191391    175990 166978 169110 124216    0
## SRR19625071_1.fastq.gz   131759    120852 113897 115399  83127    0
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")
rownames(track) <- samples

track_df <-
  track %>% 
  as.data.frame() %>%
  rownames_to_column(var="names") %>%
  mutate(perc_reads_retained = 100 * nochim / input)

DT::datatable(track_df)
track_df %>%
  pivot_longer(input:nochim, names_to = "read_type", values_to = "num_reads") %>%
  mutate(read_type = fct_relevel(read_type, 
                                 "input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")) %>%
  ggplot(aes(x = read_type, y = num_reads, fill = read_type)) + 
  geom_line(aes(group = names), color = "grey") + 
  geom_point(shape = 21, size = 3, alpha = 0.8) + 
  scale_fill_brewer(palette = "Spectral") + 
  labs(x = "Filtering Step", y = "Number of Sequences") + 
  theme_bw()

# Which sample had the lowest number of reads going into dada2?
lowest_reads <- track_df[order(track_df$input),]
head(lowest_reads)
##          names input filtered denoisedF denoisedR merged nochim
## 31 SRR19625096 31920    29016     26324     26749  16780    168
## 63 SRR19625128 34359    31462     27406     28216  15411      0
## 36 SRR19625101 39920    36454     32227     32971  18714      0
## 47 SRR19625112 40693    36722     33225     33674  21573    325
## 65 SRR19625130 44526    40464     36093     36717  22405    261
## 38 SRR19625103 56813    51892     47912     48768  33777    197
##    perc_reads_retained
## 31           0.5263158
## 63           0.0000000
## 36           0.0000000
## 47           0.7986632
## 65           0.5861744
## 38           0.3467516
# Which sample had the most reads filtered out?
most_filtered <- track_df[order(track_df$perc_reads_retained),]
head(most_filtered)
##          names  input filtered denoisedF denoisedR merged nochim
## 5  SRR19625070 191391   175990    166978    169110 124216      0
## 6  SRR19625071 131759   120852    113897    115399  83127      0
## 9  SRR19625074  93176    85364     78929     80445  51272      0
## 10 SRR19625075 165048   150286    141047    143754  97420      0
## 12 SRR19625077 113244   104183     97984     99293  70300      0
## 13 SRR19625078 157897   144019    135665    137372  96080      0
##    perc_reads_retained
## 5                    0
## 6                    0
## 9                    0
## 10                   0
## 12                   0
## 13                   0
least_filtered <-
  track_df[order(track_df$perc_reads_retained, decreasing=TRUE),]
head(least_filtered)
##          names  input filtered denoisedF denoisedR merged nochim
## 69 SRR19625134  99003    88641     81521     83089  53436    984
## 14 SRR19625079  71560    64744     60423     61359  42238    674
## 11 SRR19625076 182478   162646    153751    155883 116327   1684
## 34 SRR19625099 100181    90321     83658     85393  55555    832
## 47 SRR19625112  40693    36722     33225     33674  21573    325
## 27 SRR19625092 105528    95720     89854     91233  63180    680
##    perc_reads_retained
## 69           0.9939093
## 14           0.9418670
## 11           0.9228510
## 34           0.8304968
## 47           0.7986632
## 27           0.6443787
range(track_df$nochim)
## [1]    0 1684
median(track_df$nochim)
## [1] 15.5

This data was extremely filtered throughout the whole trimming and error reduction process. I may repeat the analysis with no trimming step, although it may still filter out just as much!!

Assign taxonomy

#Using silva database
taxa_db <- assignTaxonomy(nochimeras_asv_table, "/workdir/in_class_data/taxonomy/silva_nr99_v138.1_train_set.fa.gz")


#assigning species with silva database
taxa_species <- addSpecies(taxa_db, "/workdir/in_class_data/taxonomy/silva_species_assignment_v138.1.fa.gz")

taxa_print <- taxa_species 
rownames(taxa_print) <- NULL
head(taxa_print)
##      Kingdom    Phylum          Class           Order Family Genus Species
## [1,] "Archaea"  "Crenarchaeota" "Bathyarchaeia" NA    NA     NA    NA     
## [2,] "Archaea"  "Crenarchaeota" "Bathyarchaeia" NA    NA     NA    NA     
## [3,] "Archaea"  "Crenarchaeota" "Bathyarchaeia" NA    NA     NA    NA     
## [4,] "Bacteria" NA              NA              NA    NA     NA    NA     
## [5,] "Bacteria" NA              NA              NA    NA     NA    NA     
## [6,] "Bacteria" NA              NA              NA    NA     NA    NA

Exporting data

Output will include:

  • Two ASV Tables
  • an ASV fasta file
  • all figures generated from the analysis
####ASV Export
#Pull ASV sequences
asv_seqs <- colnames(nochimeras_asv_table)


#Headers for asv seqs
asv_headers <- vector(dim(nochimeras_asv_table)[2], mode="character")
asv_headers[1:5]
## [1] "" "" "" "" ""
#Fill vector with asv names
for (i in 1:dim(nochimeras_asv_table)[2]) {
  asv_headers[i] <- paste("ASV", i, sep="_")
}

head(asv_headers)
## [1] "ASV_1" "ASV_2" "ASV_3" "ASV_4" "ASV_5" "ASV_6"
###Rename ASVs and write as table
asv_table <- t(nochimeras_asv_table)
row.names(asv_table) <- sub(">","", asv_headers)


####Taxonomy table
tax_table <- taxa_species %>%
  as.data.frame() %>%
  rownames_to_column(var="asv_seqs")

head(tax_table)
##                                                                                                                                                                                                                                                                                                       asv_seqs
## 1 CAGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGCCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCTCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAACAACGAAGAATTTCGTTTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC
## 2 CCGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGCCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCTCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAACAACGAAGAATTTCGTTTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC
## 3  CCGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGTCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCCCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAAAACCTCGAAAGAGGATTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC
## 4 AGGGGTTTCTAATCCCGTTCGCTCCCCCAGCTTTCGTCCCTCACCGTCGGGCGCGTTTTAGTCGGCCGCCTTCGCCACTGGTGGTCCTCCCGGGATTATAAGACTGCGAAACGAAATTCTTCGTTGTTTCGCCCCTACCCCGGGACTACCGCCGACCTCCCCCGCCCCCAAGCCGGAAAGTATTCCCCGCGAGCCAACGATTGGATCGTTGGATTTTACGGAGAACTTTTCCAGCCGGCTACGGACGCTTTAGGCCCAATAATCGTCCCAACCACTCGAGGAGCTGGTATTACCGCGGCG
## 5 CGGGGTTTCTAATCCCGTTCGCTCCCCCAGCTTTCGTCCCTCACCGTCGGGCGCGTTTTAGTCGGCCGCCTTCGCCACTGGTGGTCCTCCCGGGATTATAAGACTGCGAAACGAAATTCTTCGTTGTTTCGCCCCTACCCCGGGACTACCGCCGACCTCCCCCGCCCCCAAGCCGGAAAGTATTCCCCGCGAGCCAACGATTGGATCGTTGGATTTTACGGAGAACTTTTCCAGCCGGCTACGGACGCTTTAGGCCCAATAATCGTCCCAACCACTCGAGGAGCTGGTATTACCGCGGCG
## 6 CGGGGTTTCTAATCCCGTTCGCTCCCCCAGCTTTCGTCCCTCACCGTCGGGCGCGTTTTAGTCGGCCGCCTTCGCCACTGGTGGTCCTCCCGGGATTATAAGACTGCGAAACGAAATTCTTCGTTGTTTCGCCCCTACCCCGGGACTACCGCCGACCTCCCCCGCCCCCAAGCCGGAAAGTATTCCCCGCGAGCCAACGATTGGATCGTTGGATTTTACGGAGAACTTTTCCAGCCGGCTACGGACGCTTTAGGCCCAATAATCGTCCCAACCACTCGAGGAGCTGGTATTACCGCGGCT
##    Kingdom        Phylum         Class Order Family Genus Species
## 1  Archaea Crenarchaeota Bathyarchaeia  <NA>   <NA>  <NA>    <NA>
## 2  Archaea Crenarchaeota Bathyarchaeia  <NA>   <NA>  <NA>    <NA>
## 3  Archaea Crenarchaeota Bathyarchaeia  <NA>   <NA>  <NA>    <NA>
## 4 Bacteria          <NA>          <NA>  <NA>   <NA>  <NA>    <NA>
## 5 Bacteria          <NA>          <NA>  <NA>   <NA>  <NA>    <NA>
## 6 Bacteria          <NA>          <NA>  <NA>   <NA>  <NA>    <NA>
#add ASV names
rownames(tax_table) <- rownames(asv_table)
head(tax_table)
##                                                                                                                                                                                                                                                                                                           asv_seqs
## ASV_1 CAGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGCCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCTCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAACAACGAAGAATTTCGTTTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC
## ASV_2 CCGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGCCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCTCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAACAACGAAGAATTTCGTTTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC
## ASV_3  CCGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGTCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCCCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAAAACCTCGAAAGAGGATTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC
## ASV_4 AGGGGTTTCTAATCCCGTTCGCTCCCCCAGCTTTCGTCCCTCACCGTCGGGCGCGTTTTAGTCGGCCGCCTTCGCCACTGGTGGTCCTCCCGGGATTATAAGACTGCGAAACGAAATTCTTCGTTGTTTCGCCCCTACCCCGGGACTACCGCCGACCTCCCCCGCCCCCAAGCCGGAAAGTATTCCCCGCGAGCCAACGATTGGATCGTTGGATTTTACGGAGAACTTTTCCAGCCGGCTACGGACGCTTTAGGCCCAATAATCGTCCCAACCACTCGAGGAGCTGGTATTACCGCGGCG
## ASV_5 CGGGGTTTCTAATCCCGTTCGCTCCCCCAGCTTTCGTCCCTCACCGTCGGGCGCGTTTTAGTCGGCCGCCTTCGCCACTGGTGGTCCTCCCGGGATTATAAGACTGCGAAACGAAATTCTTCGTTGTTTCGCCCCTACCCCGGGACTACCGCCGACCTCCCCCGCCCCCAAGCCGGAAAGTATTCCCCGCGAGCCAACGATTGGATCGTTGGATTTTACGGAGAACTTTTCCAGCCGGCTACGGACGCTTTAGGCCCAATAATCGTCCCAACCACTCGAGGAGCTGGTATTACCGCGGCG
## ASV_6 CGGGGTTTCTAATCCCGTTCGCTCCCCCAGCTTTCGTCCCTCACCGTCGGGCGCGTTTTAGTCGGCCGCCTTCGCCACTGGTGGTCCTCCCGGGATTATAAGACTGCGAAACGAAATTCTTCGTTGTTTCGCCCCTACCCCGGGACTACCGCCGACCTCCCCCGCCCCCAAGCCGGAAAGTATTCCCCGCGAGCCAACGATTGGATCGTTGGATTTTACGGAGAACTTTTCCAGCCGGCTACGGACGCTTTAGGCCCAATAATCGTCCCAACCACTCGAGGAGCTGGTATTACCGCGGCT
##        Kingdom        Phylum         Class Order Family Genus Species
## ASV_1  Archaea Crenarchaeota Bathyarchaeia  <NA>   <NA>  <NA>    <NA>
## ASV_2  Archaea Crenarchaeota Bathyarchaeia  <NA>   <NA>  <NA>    <NA>
## ASV_3  Archaea Crenarchaeota Bathyarchaeia  <NA>   <NA>  <NA>    <NA>
## ASV_4 Bacteria          <NA>          <NA>  <NA>   <NA>  <NA>    <NA>
## ASV_5 Bacteria          <NA>          <NA>  <NA>   <NA>  <NA>    <NA>
## ASV_6 Bacteria          <NA>          <NA>  <NA>   <NA>  <NA>    <NA>
#New column with ASV names
asv_taxa <- 
  tax_table %>%
  mutate(ASV=rownames(asv_table)) %>%
  dplyr::select(Kingdom, Phylum, Class, Order, Family, Genus, Species, ASV, asv_seqs)

head(asv_taxa)
##        Kingdom        Phylum         Class Order Family Genus Species   ASV
## ASV_1  Archaea Crenarchaeota Bathyarchaeia  <NA>   <NA>  <NA>    <NA> ASV_1
## ASV_2  Archaea Crenarchaeota Bathyarchaeia  <NA>   <NA>  <NA>    <NA> ASV_2
## ASV_3  Archaea Crenarchaeota Bathyarchaeia  <NA>   <NA>  <NA>    <NA> ASV_3
## ASV_4 Bacteria          <NA>          <NA>  <NA>   <NA>  <NA>    <NA> ASV_4
## ASV_5 Bacteria          <NA>          <NA>  <NA>   <NA>  <NA>    <NA> ASV_5
## ASV_6 Bacteria          <NA>          <NA>  <NA>   <NA>  <NA>    <NA> ASV_6
##                                                                                                                                                                                                                                                                                                           asv_seqs
## ASV_1 CAGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGCCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCTCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAACAACGAAGAATTTCGTTTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC
## ASV_2 CCGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGCCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCTCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAACAACGAAGAATTTCGTTTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC
## ASV_3  CCGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGTCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCCCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAAAACCTCGAAAGAGGATTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC
## ASV_4 AGGGGTTTCTAATCCCGTTCGCTCCCCCAGCTTTCGTCCCTCACCGTCGGGCGCGTTTTAGTCGGCCGCCTTCGCCACTGGTGGTCCTCCCGGGATTATAAGACTGCGAAACGAAATTCTTCGTTGTTTCGCCCCTACCCCGGGACTACCGCCGACCTCCCCCGCCCCCAAGCCGGAAAGTATTCCCCGCGAGCCAACGATTGGATCGTTGGATTTTACGGAGAACTTTTCCAGCCGGCTACGGACGCTTTAGGCCCAATAATCGTCCCAACCACTCGAGGAGCTGGTATTACCGCGGCG
## ASV_5 CGGGGTTTCTAATCCCGTTCGCTCCCCCAGCTTTCGTCCCTCACCGTCGGGCGCGTTTTAGTCGGCCGCCTTCGCCACTGGTGGTCCTCCCGGGATTATAAGACTGCGAAACGAAATTCTTCGTTGTTTCGCCCCTACCCCGGGACTACCGCCGACCTCCCCCGCCCCCAAGCCGGAAAGTATTCCCCGCGAGCCAACGATTGGATCGTTGGATTTTACGGAGAACTTTTCCAGCCGGCTACGGACGCTTTAGGCCCAATAATCGTCCCAACCACTCGAGGAGCTGGTATTACCGCGGCG
## ASV_6 CGGGGTTTCTAATCCCGTTCGCTCCCCCAGCTTTCGTCCCTCACCGTCGGGCGCGTTTTAGTCGGCCGCCTTCGCCACTGGTGGTCCTCCCGGGATTATAAGACTGCGAAACGAAATTCTTCGTTGTTTCGCCCCTACCCCGGGACTACCGCCGACCTCCCCCGCCCCCAAGCCGGAAAGTATTCCCCGCGAGCCAACGATTGGATCGTTGGATTTTACGGAGAACTTTTCCAGCCGGCTACGGACGCTTTAGGCCCAATAATCGTCCCAACCACTCGAGGAGCTGGTATTACCGCGGCT
stopifnot(asv_taxa$ASV == rownames(asv_taxa), rownames(asv_taxa) == rownames(asv_table))

Write files

Files to be written:

  1. asv_counts.tsv: asv count table
  2. asv_counts_with_seqnames.tsv: asv headers with the entire sequence
  3. asvs.fasta: a fasta file with ASV names from asv_counts.tsv and the sequences. this can be used to build phylogenies
  4. a copy of asvs.fasta in a new folder
  5. a taxonomy table
  6. track_read_counts.RData to track the amount of reads we lost
#Count table with numbered ASV names
write.table(asv_table, "data/01_dada2/asv_counts.tsv", sep = "\t", quote = FALSE, col.names = NA)

#Table with ASV sequence names
write.table(nochimeras_asv_table, "data/01_dada2/asv_counts_with_seqnames.tsv", sep="\t", quote=FALSE,col.names=NA)

#Fasta file for reference later
asv_fasta <- c(rbind(asv_headers, asv_seqs))
head(asv_fasta)
## [1] "ASV_1"                                                                                                                                                                                                                                                                                                       
## [2] "CAGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGCCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCTCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAACAACGAAGAATTTCGTTTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC"
## [3] "ASV_2"                                                                                                                                                                                                                                                                                                       
## [4] "CCGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGCCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCTCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAACAACGAAGAATTTCGTTTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC"
## [5] "ASV_3"                                                                                                                                                                                                                                                                                                       
## [6] "CCGCCGCGGTAATACCAGCTCCTCGAGTGGTTGGGACGATTATTGGGTCTAAAGCGTCCGTAGCCGGCTGGAAAAGTTCTCCGTAAAATCCAACGATCCAATCGTTGGCCCGCGGGGAATACTTTCCGGCTTGGGGGCGGGGGAGGTCGGCGGTAGTCCCGGGGTAGGGGCGAAAAACCTCGAAAGAGGATTCGCAGTCTTATAATCCCGGGAGGACCACCAGTGGCGAAGGCGGCCGACTAAAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACGGGATTAGAAACCCC"
write(asv_fasta, "data/01_dada2/asvs.fasta")


#Save taxonomy tables
write.table(asv_taxa, "data/01_dada2/asv_taxonomy.tsv", sep="\t", quote=FALSE, col.names=NA)

# Save to an RData object
# No chimeras asv table
save(nochimeras_asv_table, file="data/01_dada2/nochimeras_asv_table.Rdata")
#asv counts
save(asv_table, file="data/01_dada2/asv_counts.RData")
#tracking counts dataframe
save(track_df, file="data/01_dada2/track_read_counts.Rdata")

Check render time

end_time <- Sys.time()
end_time
## [1] "2024-03-12 17:40:06 EDT"
elapsed_time <- round((end_time - start_time), 3)
elapsed_time
## Time difference of 2.114 hours

Session Information

devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.3.2 (2023-10-31)
##  os       Rocky Linux 9.0 (Blue Onyx)
##  system   x86_64, linux-gnu
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       America/New_York
##  date     2024-03-12
##  pandoc   3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package              * version    date (UTC) lib source
##  abind                  1.4-5      2016-07-21 [2] CRAN (R 4.3.2)
##  ade4                   1.7-22     2023-02-06 [1] CRAN (R 4.3.2)
##  ape                    5.7-1      2023-03-13 [2] CRAN (R 4.3.2)
##  Biobase                2.62.0     2023-10-24 [2] Bioconductor
##  BiocGenerics           0.48.1     2023-11-01 [2] Bioconductor
##  BiocManager          * 1.30.22    2023-08-08 [2] CRAN (R 4.3.2)
##  BiocParallel           1.36.0     2023-10-24 [2] Bioconductor
##  biomformat             1.30.0     2023-10-24 [1] Bioconductor
##  Biostrings             2.70.1     2023-10-25 [2] Bioconductor
##  bitops                 1.0-7      2021-04-24 [2] CRAN (R 4.3.2)
##  bslib                  0.5.1      2023-08-11 [2] CRAN (R 4.3.2)
##  cachem                 1.0.8      2023-05-01 [2] CRAN (R 4.3.2)
##  callr                  3.7.3      2022-11-02 [2] CRAN (R 4.3.2)
##  cli                    3.6.1      2023-03-23 [2] CRAN (R 4.3.2)
##  cluster                2.1.4      2022-08-22 [2] CRAN (R 4.3.2)
##  codetools              0.2-19     2023-02-01 [2] CRAN (R 4.3.2)
##  colorspace             2.1-0      2023-01-23 [2] CRAN (R 4.3.2)
##  cowplot              * 1.1.3      2024-01-22 [1] CRAN (R 4.3.2)
##  crayon                 1.5.2      2022-09-29 [2] CRAN (R 4.3.2)
##  crosstalk              1.2.0      2021-11-04 [2] CRAN (R 4.3.2)
##  dada2                * 1.30.0     2023-10-24 [1] Bioconductor
##  data.table             1.14.8     2023-02-17 [2] CRAN (R 4.3.2)
##  DelayedArray           0.28.0     2023-10-24 [2] Bioconductor
##  deldir                 1.0-9      2023-05-17 [2] CRAN (R 4.3.2)
##  devtools             * 2.4.4      2022-07-20 [2] CRAN (R 4.2.1)
##  digest                 0.6.33     2023-07-07 [2] CRAN (R 4.3.2)
##  dplyr                * 1.1.3      2023-09-03 [2] CRAN (R 4.3.2)
##  DT                   * 0.32       2024-02-19 [1] CRAN (R 4.3.2)
##  ellipsis               0.3.2      2021-04-29 [2] CRAN (R 4.3.2)
##  evaluate               0.23       2023-11-01 [2] CRAN (R 4.3.2)
##  fansi                  1.0.5      2023-10-08 [2] CRAN (R 4.3.2)
##  farver                 2.1.1      2022-07-06 [2] CRAN (R 4.3.2)
##  fastmap                1.1.1      2023-02-24 [2] CRAN (R 4.3.2)
##  forcats              * 1.0.0      2023-01-29 [1] CRAN (R 4.3.2)
##  foreach                1.5.2      2022-02-02 [2] CRAN (R 4.3.2)
##  fs                     1.6.3      2023-07-20 [2] CRAN (R 4.3.2)
##  generics               0.1.3      2022-07-05 [2] CRAN (R 4.3.2)
##  GenomeInfoDb           1.38.0     2023-10-24 [2] Bioconductor
##  GenomeInfoDbData       1.2.11     2023-11-07 [2] Bioconductor
##  GenomicAlignments      1.38.0     2023-10-24 [2] Bioconductor
##  GenomicRanges          1.54.1     2023-10-29 [2] Bioconductor
##  ggplot2              * 3.5.0      2024-02-23 [1] CRAN (R 4.3.2)
##  glue                   1.6.2      2022-02-24 [2] CRAN (R 4.3.2)
##  gtable                 0.3.4      2023-08-21 [2] CRAN (R 4.3.2)
##  highr                  0.10       2022-12-22 [2] CRAN (R 4.3.2)
##  hms                    1.1.3      2023-03-21 [1] CRAN (R 4.3.2)
##  htmltools              0.5.7      2023-11-03 [2] CRAN (R 4.3.2)
##  htmlwidgets            1.6.2      2023-03-17 [2] CRAN (R 4.3.2)
##  httpuv                 1.6.12     2023-10-23 [2] CRAN (R 4.3.2)
##  hwriter                1.3.2.1    2022-04-08 [1] CRAN (R 4.3.2)
##  igraph                 1.5.1      2023-08-10 [2] CRAN (R 4.3.2)
##  iNEXT                * 3.0.0      2022-08-29 [1] CRAN (R 4.3.2)
##  interp                 1.1-6      2024-01-26 [1] CRAN (R 4.3.2)
##  IRanges                2.36.0     2023-10-24 [2] Bioconductor
##  iterators              1.0.14     2022-02-05 [2] CRAN (R 4.3.2)
##  jpeg                   0.1-10     2022-11-29 [1] CRAN (R 4.3.2)
##  jquerylib              0.1.4      2021-04-26 [2] CRAN (R 4.3.2)
##  jsonlite               1.8.7      2023-06-29 [2] CRAN (R 4.3.2)
##  knitr                  1.45       2023-10-30 [2] CRAN (R 4.3.2)
##  labeling               0.4.3      2023-08-29 [2] CRAN (R 4.3.2)
##  later                  1.3.1      2023-05-02 [2] CRAN (R 4.3.2)
##  lattice              * 0.21-9     2023-10-01 [2] CRAN (R 4.3.2)
##  latticeExtra           0.6-30     2022-07-04 [1] CRAN (R 4.3.2)
##  lifecycle              1.0.3      2022-10-07 [2] CRAN (R 4.3.2)
##  lubridate            * 1.9.3      2023-09-27 [1] CRAN (R 4.3.2)
##  magrittr               2.0.3      2022-03-30 [2] CRAN (R 4.3.2)
##  MASS                   7.3-60     2023-05-04 [2] CRAN (R 4.3.2)
##  Matrix                 1.6-1.1    2023-09-18 [2] CRAN (R 4.3.2)
##  MatrixGenerics         1.14.0     2023-10-24 [2] Bioconductor
##  matrixStats            1.1.0      2023-11-07 [2] CRAN (R 4.3.2)
##  memoise                2.0.1      2021-11-26 [2] CRAN (R 4.3.2)
##  mgcv                   1.9-0      2023-07-11 [2] CRAN (R 4.3.2)
##  mime                   0.12       2021-09-28 [2] CRAN (R 4.3.2)
##  miniUI                 0.1.1.1    2018-05-18 [2] CRAN (R 4.3.2)
##  multtest               2.58.0     2023-10-24 [1] Bioconductor
##  munsell                0.5.0      2018-06-12 [2] CRAN (R 4.3.2)
##  nlme                   3.1-163    2023-08-09 [2] CRAN (R 4.3.2)
##  pacman               * 0.5.1      2019-03-11 [1] CRAN (R 4.3.2)
##  patchwork            * 1.2.0.9000 2024-03-12 [1] Github (thomasp85/patchwork@d943757)
##  permute              * 0.9-7      2022-01-27 [1] CRAN (R 4.3.2)
##  phyloseq             * 1.46.0     2023-10-24 [1] Bioconductor
##  pillar                 1.9.0      2023-03-22 [2] CRAN (R 4.3.2)
##  pkgbuild               1.4.2      2023-06-26 [2] CRAN (R 4.3.2)
##  pkgconfig              2.0.3      2019-09-22 [2] CRAN (R 4.3.2)
##  pkgload                1.3.3      2023-09-22 [2] CRAN (R 4.3.2)
##  plyr                   1.8.9      2023-10-02 [2] CRAN (R 4.3.2)
##  png                    0.1-8      2022-11-29 [2] CRAN (R 4.3.2)
##  prettyunits            1.2.0      2023-09-24 [2] CRAN (R 4.3.2)
##  processx               3.8.2      2023-06-30 [2] CRAN (R 4.3.2)
##  profvis                0.3.8      2023-05-02 [2] CRAN (R 4.3.2)
##  promises               1.2.1      2023-08-10 [2] CRAN (R 4.3.2)
##  ps                     1.7.5      2023-04-18 [2] CRAN (R 4.3.2)
##  purrr                * 1.0.2      2023-08-10 [2] CRAN (R 4.3.2)
##  R6                     2.5.1      2021-08-19 [2] CRAN (R 4.3.2)
##  RColorBrewer           1.1-3      2022-04-03 [2] CRAN (R 4.3.2)
##  Rcpp                 * 1.0.11     2023-07-06 [2] CRAN (R 4.3.2)
##  RcppParallel           5.1.7      2023-02-27 [2] CRAN (R 4.3.2)
##  RCurl                  1.98-1.13  2023-11-02 [2] CRAN (R 4.3.2)
##  readr                * 2.1.5      2024-01-10 [1] CRAN (R 4.3.2)
##  remotes                2.4.2.1    2023-07-18 [2] CRAN (R 4.3.2)
##  reshape2               1.4.4      2020-04-09 [2] CRAN (R 4.3.2)
##  rhdf5                  2.46.1     2023-11-29 [1] Bioconductor 3.18 (R 4.3.2)
##  rhdf5filters           1.14.1     2023-11-06 [1] Bioconductor
##  Rhdf5lib               1.24.2     2024-02-07 [1] Bioconductor 3.18 (R 4.3.2)
##  rlang                  1.1.2      2023-11-04 [2] CRAN (R 4.3.2)
##  rmarkdown              2.25       2023-09-18 [2] CRAN (R 4.3.2)
##  Rsamtools              2.18.0     2023-10-24 [2] Bioconductor
##  rstudioapi             0.15.0     2023-07-07 [2] CRAN (R 4.3.2)
##  S4Arrays               1.2.0      2023-10-24 [2] Bioconductor
##  S4Vectors              0.40.1     2023-10-26 [2] Bioconductor
##  sass                   0.4.7      2023-07-15 [2] CRAN (R 4.3.2)
##  scales                 1.3.0      2023-11-28 [2] CRAN (R 4.3.2)
##  sessioninfo            1.2.2      2021-12-06 [2] CRAN (R 4.3.2)
##  shiny                  1.7.5.1    2023-10-14 [2] CRAN (R 4.3.2)
##  ShortRead              1.60.0     2023-10-24 [1] Bioconductor
##  SparseArray            1.2.1      2023-11-05 [2] Bioconductor
##  stringi                1.7.12     2023-01-11 [2] CRAN (R 4.3.2)
##  stringr              * 1.5.0      2022-12-02 [2] CRAN (R 4.3.2)
##  SummarizedExperiment   1.32.0     2023-10-24 [2] Bioconductor
##  survival               3.5-7      2023-08-14 [2] CRAN (R 4.3.2)
##  tibble               * 3.2.1      2023-03-20 [2] CRAN (R 4.3.2)
##  tidyr                * 1.3.0      2023-01-24 [2] CRAN (R 4.3.2)
##  tidyselect             1.2.0      2022-10-10 [2] CRAN (R 4.3.2)
##  tidyverse            * 2.0.0      2023-02-22 [1] CRAN (R 4.3.2)
##  timechange             0.3.0      2024-01-18 [1] CRAN (R 4.3.2)
##  tzdb                   0.4.0      2023-05-12 [1] CRAN (R 4.3.2)
##  urlchecker             1.0.1      2021-11-30 [2] CRAN (R 4.3.2)
##  usethis              * 2.2.2      2023-07-06 [2] CRAN (R 4.3.2)
##  utf8                   1.2.4      2023-10-22 [2] CRAN (R 4.3.2)
##  vctrs                  0.6.4      2023-10-12 [2] CRAN (R 4.3.2)
##  vegan                * 2.6-4      2022-10-11 [1] CRAN (R 4.3.2)
##  withr                  2.5.2      2023-10-30 [2] CRAN (R 4.3.2)
##  xfun                   0.41       2023-11-01 [2] CRAN (R 4.3.2)
##  xtable                 1.8-4      2019-04-21 [2] CRAN (R 4.3.2)
##  XVector                0.42.0     2023-10-24 [2] Bioconductor
##  yaml                   2.3.7      2023-01-23 [2] CRAN (R 4.3.2)
##  zlibbioc               1.48.0     2023-10-24 [2] Bioconductor
## 
##  [1] /home/mld253/R/x86_64-pc-linux-gnu-library/4.3
##  [2] /programs/R-4.3.2/library
## 
## ──────────────────────────────────────────────────────────────────────────────